set.seed(2021)
prev_thresh <- 0.1 # remove ASVs with prevalence less than that
samples_prep_path <- "./"
data_path <- "./DADA2_pseudo/"
# Metadata_table <- "./AnCUE_Metadata_decontam.csv"
# Seq_table <- "DADA2.seqtab_nochim_decontam.tsv"
# Seq_file <- "DADA2.Seqs_decontam.fa"
Ps_file <- "Ps_obj_decontam.Rds"
Seq_file <- "DADA2.Seqs_decontam.fa"Also remove controls
readRDS(paste0(data_path, Ps_file)) ->
Ps_obj
Ps_obj %>%
subset_samples(., Hours == 0) ->
Ps_obj_t0
Ps_obj %>%
subset_samples(., Hours != 0 & !is.na(Site)) ->
Ps_obj_SIP Combine repeated runs
sample_data(Ps_obj_SIP) %<>%
as("data.frame") %>%
rownames_to_column() %>%
mutate(Identifier = paste(Site, Oxygen, Glucose, Label..13C., Hours, Fraction.no.,
sep = "_")) %>%
column_to_rownames("rowname")
phyloseq_merge_samples(Ps_obj_SIP, "Identifier") %>%
filter_taxa(., function(x) sum(x) > 0, TRUE) ->
Ps_obj_SIP_merged
# Compute lib sizes
sample_data(Ps_obj_SIP_merged)$Lib.size <- rowSums(otu_table(Ps_obj_SIP_merged))First let’s look at the count data distribution
I will test now the effect of library size and all other experimental factors on the community composition and also plot
(mod1 <- adonis(vegdist(otu_table(Ps_obj_SIP_merged), method = "bray") ~ Site * Oxygen * Hours + Lib.size,
data = as(sample_data(Ps_obj_SIP_merged), "data.frame"),
permutations = 999
))##
## Call:
## adonis(formula = vegdist(otu_table(Ps_obj_SIP_merged), method = "bray") ~ Site * Oxygen * Hours + Lib.size, data = as(sample_data(Ps_obj_SIP_merged), "data.frame"), permutations = 999)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Site 1 22.555 22.5553 314.924 0.39126 0.001 ***
## Oxygen 1 6.377 6.3767 89.033 0.11061 0.001 ***
## Hours 1 0.333 0.3332 4.652 0.00578 0.002 **
## Lib.size 1 4.943 4.9430 69.016 0.08575 0.001 ***
## Site:Oxygen 1 1.483 1.4830 20.706 0.02573 0.001 ***
## Site:Hours 1 0.471 0.4707 6.572 0.00817 0.002 **
## Oxygen:Hours 1 0.281 0.2809 3.922 0.00487 0.005 **
## Site:Oxygen:Hours 1 0.363 0.3631 5.070 0.00630 0.001 ***
## Residuals 291 20.842 0.0716 0.36154
## Total 299 57.648 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Modelling library size shows a significant effect of read depth on the community structure, but explaining only 9% of the variance. The reads histogram shows as expected a highly sparse and skewed sequence matrix. The mean vs SD also shows as expected large dependency of SD on the mean reads of a sequence across all samples.
Frist let’s look at the taxonomic distribution
##
## Archaea Bacteria
## 185 14280
##
## Abditibacteria Acidimicrobiia Acidobacteriae
## 21 679 1199
## Actinobacteria AD3 Alphaproteobacteria
## 784 27 2031
## Anaerolineae Armatimonadia Babeliae
## 3 84 680
## Bacilli Bacteroidia BD7-11
## 459 503 16
## Bdellovibrionia Blastocatellia Campylobacteria
## 110 20 5
## Chlamydiae Chloroflexia Chthonomonadetes
## 583 4 63
## Clostridia Coriobacteriia Cyanobacteriia
## 276 4 178
## Dehalococcoidia Deinococci Desulfitobacteriia
## 1 10 12
## Desulfobulbia Desulfovibrionia Elusimicrobia
## 1 2 14
## Endomicrobia Fibrobacteria Fimbriimonadia
## 2 5 58
## Fusobacteriia Gammaproteobacteria Gemmatimonadetes
## 21 2250 25
## Gitt-GS-136 Gracilibacteria Halobacteria
## 1 1 1
## Holophagae Hydrogenedentia JG30-KF-CM66
## 2 1 25
## Kapabacteria KD4-96 Ktedonobacteria
## 10 1 82
## Lentisphaeria Lineage IIa Longimicrobia
## 2 5 2
## Methanobacteria Micrarchaeia Myxococcia
## 1 4 111
## Negativicutes Nitrososphaeria Oligoflexia
## 13 129 106
## OM190 Omnitrophia Parcubacteria
## 1 11 11
## Phycisphaerae Planctomycetes Polyangia
## 154 1703 146
## Rhodothermia S0134 terrestrial group Saccharimonadia
## 2 1 49
## Sericytochromatia SJA-28 Spirochaetia
## 1 1 1
## Subgroup 22 Subgroup 5 Syntrophobacteria
## 1 12 1
## Thermoanaerobaculia Thermoleophilia Thermoplasmata
## 2 223 50
## TK10 Unclassified vadinHA49
## 10 615 11
## Vampirivibrionia Verrucomicrobiae Vicinamibacteria
## 206 599 27
Now let’s remove some taxa which are obvious artefacts or those which aren’t bacteria or archaea
kingdoms2remove <- c("", "Eukaryota", "Unclassified")
orders2remove <- c("Chloroplast")
families2remove <- c("Mitochondria")
Ps_obj_kingdoms <- tax_glom(Ps_obj_SIP_merged, "Kingdom")
Ps_obj_orders <- tax_glom(Ps_obj_SIP_merged, "Order")
Ps_obj_families <- tax_glom(Ps_obj_SIP_merged, "Family")
Ps_obj_SIP_merged_filt <- subset_taxa(Ps_obj_SIP_merged, !is.na(Phylum) &
!Kingdom %in% kingdoms2remove &
!Order %in% orders2remove &
!Family %in% families2remove)Summary_pruned <- tibble(
Level = c("Kingdom", "Order", "Family"),
ASVs.removed = c(
table(tax_table(Ps_obj)[, "Kingdom"], exclude = NULL) %>% as.data.frame() %>% .[.$Var1 == "" | .$Var1 == "Eukaryota" | .$Var1 == "Unclassified", 2] %>% sum(),
table(tax_table(Ps_obj)[, "Order"], exclude = NULL) %>% as.data.frame() %>% .[.$Var1 == "Chloroplast", 2] %>% sum(),
table(tax_table(Ps_obj)[, "Family"], exclude = NULL) %>% as.data.frame() %>% .[.$Var1 == "Mitochondria", 2] %>% sum()
),
Seqs.removed = c(
psmelt(Ps_obj_kingdoms) %>%
group_by(Kingdom) %>%
filter(Kingdom == "" |
Kingdom == "Eukaryota" | Kingdom == "Unclassified") %>%
summarise(sum = sum(Abundance)) %>% .$sum %>% sum(),
psmelt(Ps_obj_orders) %>%
group_by(Order) %>%
filter(Order == orders2remove) %>%
summarise(sum = sum(Abundance)) %>% .$sum %>% sum(),
psmelt(Ps_obj_families) %>%
group_by(Family) %>%
filter(Family == families2remove) %>%
summarise(sum = sum(Abundance)) %>% .$sum %>% sum()
)
)
Summary_pruned %>%
kable(., digits = c(0, 1, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)| Level | ASVs.removed | Seqs.removed |
|---|---|---|
| Kingdom | 0 | 0 |
| Order | 136 | 4294 |
| Family | 138 | 15725 |
Removed 0.0335% of the sequences.
Now let’s explore the prevalence of different taxa in the database. Prevalence is the number of samples in which a taxa appears at least once. So “Mean prevalence” refers to in how many samples does a sequence belonging to the phylum appears on average, and “Sum prevalence” is the sum of all samples where any sequence from the taxon appears.
prevdf <- apply(X = otu_table(Ps_obj_SIP_merged_filt),
MARGIN = ifelse(taxa_are_rows(Ps_obj_SIP_merged_filt), yes = 1, no = 2),
FUN = function(x){sum(x > 0)})
# Add taxonomy and total read counts to this data.frame
prevdf <- data.frame(Prevalence = prevdf,
TotalAbundance = taxa_sums(Ps_obj_SIP_merged_filt),
tax_table(Ps_obj_SIP_merged_filt))
prevdf %>%
group_by(Phylum) %>%
summarise(`Mean prevalence` = mean(Prevalence),
`Sum prevalence` = sum(Prevalence)) ->
Prevalence_phylum_summary
Prevalence_phylum_summary %>%
kable(., digits = c(0, 1, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)| Phylum | Mean prevalence | Sum prevalence |
|---|---|---|
| Abditibacteriota | 21.3 | 448 |
| Acidobacteriota | 58.2 | 73619 |
| Actinobacteriota | 51.3 | 86754 |
| Armatimonadota | 43.5 | 9178 |
| Bacteroidota | 20.9 | 10783 |
| Bdellovibrionota | 7.5 | 1625 |
| Campilobacterota | 3.6 | 18 |
| Chloroflexi | 29.5 | 4632 |
| Crenarchaeota | 38.9 | 5012 |
| Cyanobacteria | 17.5 | 4458 |
| Deinococcota | 1.5 | 15 |
| Dependentiae | 11.7 | 7981 |
| Desulfobacterota | 6.7 | 40 |
| Elusimicrobiota | 13.0 | 273 |
| Euryarchaeota | 1.0 | 1 |
| FCPU426 | 44.6 | 1606 |
| Fibrobacterota | 9.0 | 45 |
| Firmicutes | 22.1 | 18329 |
| Fusobacteriota | 4.4 | 93 |
| GAL15 | 40.1 | 281 |
| Gemmatimonadota | 16.7 | 468 |
| Halobacterota | 1.0 | 1 |
| Hydrogenedentes | 1.0 | 1 |
| Micrarchaeota | 1.5 | 6 |
| Myxococcota | 46.7 | 11996 |
| Patescibacteria | 9.0 | 588 |
| Planctomycetota | 36.0 | 67790 |
| Proteobacteria | 28.2 | 119347 |
| RCP2-54 | 105.0 | 8924 |
| SAR324 clade(Marine group B) | 33.0 | 33 |
| Spirochaetota | 1.0 | 1 |
| Thermoplasmatota | 49.3 | 2464 |
| Unclassified | 10.3 | 1825 |
| Verrucomicrobiota | 29.7 | 35502 |
| WPS-2 | 63.1 | 8456 |
prevdf %>%
group_by(Order) %>%
summarise(`Mean prevalence` = mean(Prevalence),
`Sum prevalence` = sum(Prevalence)) ->
Prevalence_order_summary
Prevalence_order_summary %>%
kable(., digits = c(0, 1, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)| Order | Mean prevalence | Sum prevalence |
|---|---|---|
| 0319-6G20 | 8.2 | 655 |
| 11-24 | 7.5 | 15 |
| Abditibacteriales | 21.3 | 448 |
| Absconditabacteriales (SR1) | 2.0 | 2 |
| Acetobacterales | 69.1 | 16435 |
| Acholeplasmatales | 1.0 | 1 |
| Acidaminococcales | 1.0 | 1 |
| Acidimicrobiales | 1.0 | 1 |
| Acidobacteriales | 51.6 | 30887 |
| Actinomycetales | 3.5 | 38 |
| Aeromonadales | 1.0 | 540 |
| Alicyclobacillales | 26.6 | 186 |
| Alteromonadales | 1.0 | 1 |
| Ardenticatenales | 1.0 | 2 |
| Armatimonadales | 43.8 | 3676 |
| Azospirillales | 38.2 | 992 |
| B12-WMSP1 | 1.0 | 1 |
| Babeliales | 11.7 | 7981 |
| Bacillales | 35.0 | 3151 |
| Bacteriovoracales | 1.2 | 5 |
| Bacteroidales | 2.0 | 117 |
| Bacteroidetes VC2.1 Bac22 | 1.5 | 3 |
| Bdellovibrionales | 6.5 | 687 |
| Bifidobacteriales | 1.0 | 3 |
| Blastocatellales | 1.3 | 22 |
| Blfdi19 | 22.5 | 45 |
| Bryobacterales | 76.7 | 11734 |
| Burkholderiales | 23.8 | 6741 |
| Caedibacterales | 11.7 | 82 |
| Caldalkalibacillales | 3.5 | 7 |
| Campylobacterales | 3.6 | 18 |
| Candidatus Jorgensenbacteria | 3.8 | 23 |
| Candidatus Kaiserbacteria | 1.0 | 1 |
| Cardiobacteriales | 4.3 | 13 |
| Catenulisporales | 97.8 | 2151 |
| Caulobacterales | 57.1 | 9421 |
| Cellvibrionales | 1.0 | 2 |
| Chitinophagales | 34.7 | 5136 |
| Chlamydiales | 17.1 | 9989 |
| Chloroflexales | 1.0 | 1 |
| Christensenellales | 1.4 | 18 |
| Chthoniobacterales | 26.0 | 4752 |
| Chthonomonadales | 19.0 | 1199 |
| Clostridia UCG-014 | 1.0 | 2 |
| Clostridiales | 16.8 | 606 |
| Coriobacteriales | 1.2 | 5 |
| Corynebacteriales | 41.8 | 5059 |
| Coxiellales | 13.5 | 3232 |
| Cyanobacteriales | 3.9 | 126 |
| Cytophagales | 5.1 | 471 |
| Deinococcales | 1.5 | 15 |
| Desulfitobacteriales | 23.2 | 279 |
| Desulfobulbales | 1.0 | 1 |
| Desulfovibrionales | 1.0 | 2 |
| Diplorickettsiales | 10.1 | 2318 |
| Dongiales | 2.0 | 2 |
| Elev-1554 | 14.5 | 29 |
| Elev-16S-1166 | 1.0 | 1 |
| Elsterales | 57.4 | 19463 |
| Endomicrobiales | 1.0 | 2 |
| Enterobacterales | 4.5 | 150 |
| Entomoplasmatales | 8.2 | 41 |
| Erysipelotrichales | 6.7 | 100 |
| Exiguobacterales | 1.0 | 3 |
| FCPU453 | 32.5 | 130 |
| Fibrobacterales | 9.0 | 45 |
| Fimbriimonadales | 70.2 | 4072 |
| Flavobacteriales | 2.6 | 124 |
| Frankiales | 83.3 | 28396 |
| Fusobacteriales | 4.4 | 93 |
| Gaiellales | 71.8 | 933 |
| Gammaproteobacteria Incertae Sedis | 32.0 | 6088 |
| Gastranaerophilales | 1.0 | 3 |
| Gemmatales | 34.4 | 49765 |
| Gemmatimonadales | 18.6 | 465 |
| Group 1.1c | 39.2 | 3335 |
| Haliangiales | 37.5 | 488 |
| Halobacterales | 1.0 | 1 |
| Holosporales | 22.9 | 1280 |
| Hungateiclostridiaceae | 1.0 | 2 |
| Hydrogenedentiales | 1.0 | 1 |
| I3A | 10.6 | 53 |
| IMCC26256 | 37.0 | 7954 |
| Isosphaerales | 55.7 | 5624 |
| JG36-TzT-191 | 82.3 | 3375 |
| Kapabacteriales | 38.3 | 383 |
| KF-JG30-C25 | 20.0 | 60 |
| Kineosporiales | 1.0 | 5 |
| Ktedonobacterales | 27.6 | 2232 |
| Lachnospirales | 5.6 | 767 |
| Lactobacillales | 14.4 | 1209 |
| Legionellales | 19.7 | 3175 |
| Leptolyngbyales | 1.0 | 1 |
| Lineage IV | 10.9 | 98 |
| Longimicrobiales | 1.0 | 2 |
| Methanobacteriales | 1.0 | 1 |
| Methanomassiliicoccales | 1.0 | 2 |
| Methylacidiphilales | 33.8 | 2163 |
| Methylococcales | 9.0 | 18 |
| Micavibrionales | 57.0 | 57 |
| Micrarchaeales | 1.5 | 6 |
| Micrococcales | 7.4 | 891 |
| Micromonosporales | 1.5 | 3 |
| Micropepsales | 56.0 | 4645 |
| Microtrichales | 2.1 | 27 |
| mle1-27 | 11.0 | 44 |
| Monoglobales | 1.0 | 4 |
| Myxococcales | 22.2 | 2464 |
| Nitrosococcales | 37.0 | 74 |
| Nitrososphaerales | 60.5 | 1150 |
| Nitrosotaleales | 25.6 | 436 |
| Obscuribacterales | 21.2 | 4153 |
| Oceanospirillales | 2.7 | 8 |
| Oligoflexales | 16.6 | 216 |
| Omnitrophales | 25.5 | 281 |
| Opitutales | 38.4 | 3957 |
| Oscillospirales | 4.6 | 146 |
| Oxyphotobacteria Incertae Sedis | 1.0 | 2 |
| Paenibacillales | 53.1 | 8012 |
| Paracaedibacterales | 11.6 | 792 |
| Pasteurellales | 14.1 | 127 |
| Pedosphaerales | 70.4 | 13805 |
| Peptococcales | 1.0 | 1 |
| Peptostreptococcales-Tissierellales | 6.4 | 256 |
| Phormidesmiales | 1.0 | 2 |
| Phycisphaerales | 21.0 | 524 |
| Pirellulales | 34.5 | 4455 |
| Planctomycetales | 85.8 | 2230 |
| Polyangiales | 70.8 | 8783 |
| Propionibacteriales | 10.5 | 221 |
| Pseudanabaenales | 1.0 | 1 |
| Pseudomonadales | 15.1 | 1782 |
| Pseudonocardiales | 36.8 | 699 |
| Pyrinomonadales | 1.0 | 1 |
| Reyranellales | 29.3 | 352 |
| Rhizobiales | 62.3 | 23562 |
| Rhodobacterales | 2.6 | 84 |
| Rhodospirillales | 13.8 | 1658 |
| Rhodothermales | 1.0 | 2 |
| Rickettsiales | 15.6 | 1485 |
| S-BQ2-57 soil group | 11.8 | 177 |
| S085 | 1.0 | 1 |
| Saccharimonadales | 11.0 | 537 |
| Salinisphaerales | 48.2 | 1013 |
| SAR11 clade | 1.0 | 1 |
| SBR1031 | 1.0 | 1 |
| Silvanigrellales | 4.8 | 62 |
| Solibacterales | 89.8 | 10771 |
| Solirubrobacterales | 75.9 | 15946 |
| Sphingobacteriales | 29.4 | 4468 |
| Sphingomonadales | 4.6 | 705 |
| Spirochaetales | 1.0 | 1 |
| Staphylococcales | 21.6 | 410 |
| Steroidobacterales | 1.0 | 1 |
| Streptomycetales | 63.1 | 883 |
| Streptosporangiales | 2.9 | 105 |
| Subgroup 12 | 10.8 | 43 |
| Subgroup 13 | 61.6 | 1725 |
| Subgroup 15 | 5.1 | 72 |
| Subgroup 2 | 73.8 | 16985 |
| Subgroup 7 | 1.0 | 2 |
| Synechococcales | 1.0 | 2 |
| Syntrophobacterales | 35.0 | 35 |
| Tepidisphaerales | 35.1 | 4533 |
| Thalassobaculales | 1.0 | 1 |
| Thermicanales | 5.0 | 10 |
| Thermoactinomycetales | 30.0 | 1951 |
| Thermoanaerobaculales | 15.5 | 31 |
| Thermomicrobiales | 1.0 | 1 |
| TSBb06 | 10.2 | 61 |
| Unclassified | 31.5 | 56145 |
| Vampirovibrionales | 17.3 | 104 |
| Veillonellales-Selenomonadales | 6.7 | 80 |
| Verrucomicrobiales | 10.1 | 375 |
| Vibrionales | 1.7 | 5 |
| Vicinamibacterales | 31.3 | 846 |
| Victivallales | 1.0 | 2 |
| WD260 | 158.8 | 3335 |
| Xanthomonadales | 29.6 | 2336 |
Based on that I’ll remove all orders with a sum prevalence of under 5% (15) of all samples
Prevalence_order_summary %>%
filter(`Sum prevalence` < (0.05 * nsamples(Ps_obj_SIP_merged_filt))) %>%
dplyr::select(Order) %>%
map(as.character) %>%
unlist() ->
filterOrder
Ps_obj_SIP_merged_filt2 <- subset_taxa(Ps_obj_SIP_merged_filt, !Order %in% filterOrder)
sample_data(Ps_obj_SIP_merged_filt2)$Lib.size <- rowSums(otu_table(Ps_obj_SIP_merged_filt2))
print(Ps_obj_SIP_merged_filt)## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 14199 taxa and 300 samples ]:
## sample_data() Sample Data: [ 300 samples by 29 sample variables ]:
## tax_table() Taxonomy Table: [ 14199 taxa by 6 taxonomic ranks ]:
## taxa are columns
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 14099 taxa and 300 samples ]:
## sample_data() Sample Data: [ 300 samples by 29 sample variables ]:
## tax_table() Taxonomy Table: [ 14099 taxa by 6 taxonomic ranks ]:
## taxa are columns
This removed 100 or 1% of the ESVs, and 0.046% of the reads.
Plot general prevalence features of the phyla
# Subset to the remaining phyla
prevdf_phylum_filt <- subset(prevdf,
Phylum %in% get_taxa_unique(Ps_obj_SIP_merged_filt2,
"Phylum"))
ggplot(prevdf_phylum_filt,
aes(TotalAbundance,
Prevalence / nsamples(Ps_obj_SIP_merged_filt2),
color = Phylum)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) +
geom_point2(size = 2, alpha = 0.7) +
scale_x_log10() +
xlab("Total Abundance") +
ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Phylum) +
theme(legend.position = "none")Plot general prevalence features of the top 20 orders
# Subset to the remaining phyla
prevdf_order_filt <- subset(prevdf,
Order %in% get_taxa_unique(Ps_obj_SIP_merged_filt2, "Order"))
# grab the top 30 most abundant orders
prevdf_order_filt %>%
group_by(Order) %>%
summarise(Combined.abundance = sum(TotalAbundance)) %>%
arrange(desc(Combined.abundance)) %>%
.[1:30, "Order"] ->
Orders2plot
prevdf_order_filt2 <- subset(prevdf,
Order %in% Orders2plot$Order)
ggplot(prevdf_order_filt2,
aes(TotalAbundance,
Prevalence / nsamples(Ps_obj_SIP_merged_filt2),
color = Order)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) +
geom_point2(size = 2, alpha = 0.7) +
scale_x_log10() +
xlab("Total Abundance") +
ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Order) +
theme(legend.position = "none")I’ll remove all sequences which appear in less than 5% of the samples
# Define prevalence threshold as 5% of total samples
prevalenceThreshold <- prev_thresh * nsamples(Ps_obj_SIP_merged_filt2)
prevalenceThreshold## [1] 30
# Execute prevalence filter, using `prune_taxa()` function
keepTaxa <-
row.names(prevdf_phylum_filt)[(prevdf_phylum_filt$Prevalence >= prevalenceThreshold)]
Ps_obj_SIP_merged_filt3 <- prune_taxa(keepTaxa, Ps_obj_SIP_merged_filt2)
sample_data(Ps_obj_SIP_merged_filt3)$Lib.size <- rowSums(otu_table(Ps_obj_SIP_merged_filt3))
print(Ps_obj_SIP_merged_filt2)## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 14099 taxa and 300 samples ]:
## sample_data() Sample Data: [ 300 samples by 29 sample variables ]:
## tax_table() Taxonomy Table: [ 14099 taxa by 6 taxonomic ranks ]:
## taxa are columns
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 3693 taxa and 300 samples ]:
## sample_data() Sample Data: [ 300 samples by 29 sample variables ]:
## tax_table() Taxonomy Table: [ 3693 taxa by 6 taxonomic ranks ]:
## taxa are columns
This removed 10406 or 74% of the ESVs!
However all these removed ESVs accounted for only:
prevdf_phylum_filt %>%
arrange(., Prevalence) %>%
group_by(Prevalence > prevalenceThreshold) %>%
summarise(Abundance = sum(TotalAbundance)) %>%
mutate(`Rel. Ab.` = percent(Abundance / sum(Abundance), accuracy = 0.01)) %>%
kable(., digits = c(0, 1, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)| Prevalence > prevalenceThreshold | Abundance | Rel. Ab. |
|---|---|---|
| FALSE | 303010 | 2.37% |
| TRUE | 12498417 | 97.63% |
So it’s fine to remove them.
Test again the effect of library size and all other experimental factors on the community composition after filtering
(mod1 <- adonis(vegdist(otu_table(Ps_obj_SIP_merged_filt3), method = "bray") ~ Site * Oxygen * Hours + Lib.size,
data = as(sample_data(Ps_obj_SIP_merged_filt3), "data.frame"),
permutations = 999
))##
## Call:
## adonis(formula = vegdist(otu_table(Ps_obj_SIP_merged_filt3), method = "bray") ~ Site * Oxygen * Hours + Lib.size, data = as(sample_data(Ps_obj_SIP_merged_filt3), "data.frame"), permutations = 999)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Site 1 22.638 22.6379 342.15 0.40294 0.001 ***
## Oxygen 1 6.358 6.3575 96.09 0.11316 0.001 ***
## Hours 1 0.322 0.3223 4.87 0.00574 0.005 **
## Lib.size 1 5.069 5.0686 76.61 0.09022 0.001 ***
## Site:Oxygen 1 1.475 1.4747 22.29 0.02625 0.001 ***
## Site:Hours 1 0.458 0.4576 6.92 0.00814 0.001 ***
## Oxygen:Hours 1 0.264 0.2643 3.99 0.00470 0.007 **
## Site:Oxygen:Hours 1 0.345 0.3448 5.21 0.00614 0.002 **
## Residuals 291 19.254 0.0662 0.34270
## Total 299 56.181 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
saveRDS(Ps_obj_SIP_merged_filt3, file = paste0(data_path, str_remove(Ps_file, ".Rds"), "_filt3.Rds"))
readDNAStringSet(paste0(data_path, Seq_file)) %>%
.[taxa_names(Ps_obj_SIP_merged_filt3)] %>%
writeXStringSet(., filepath = paste0(data_path, str_remove(Seq_file, ".fa*"), "_filtered.fa"), format = "fasta", width = 1000) Current session info
─ Session info ─────────────────────────────────────────────────────────────────────────
setting value
version R version 4.0.3 (2020-10-10)
os Ubuntu 18.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Prague
date 2021-02-11
─ Packages ─────────────────────────────────────────────────────────────────────────────
package * version date lib source
ade4 1.7-16 2020-10-28 [1] CRAN (R 4.0.2)
affy 1.66.0 2020-04-27 [1] Bioconductor
affyio 1.58.0 2020-04-27 [1] Bioconductor
ape 5.4-1 2020-08-13 [1] CRAN (R 4.0.2)
assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.2)
backports 1.2.1 2020-12-09 [1] CRAN (R 4.0.2)
bayestestR 0.8.2 2021-01-26 [1] CRAN (R 4.0.3)
Biobase * 2.48.0 2020-04-27 [1] Bioconductor
BiocGenerics * 0.34.0 2020-04-27 [1] Bioconductor
BiocManager 1.30.10 2019-11-16 [1] CRAN (R 4.0.2)
biomformat 1.16.0 2020-04-27 [1] Bioconductor
Biostrings * 2.56.0 2020-04-27 [1] Bioconductor
broom 0.7.4 2021-01-29 [1] CRAN (R 4.0.3)
cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.0.2)
cli 2.3.0 2021-01-31 [1] CRAN (R 4.0.3)
clipr 0.7.1 2020-10-08 [1] CRAN (R 4.0.2)
cluster 2.1.0 2019-06-19 [1] CRAN (R 4.0.2)
codetools 0.2-18 2020-11-04 [1] CRAN (R 4.0.2)
colorspace 2.0-0 2020-11-11 [1] CRAN (R 4.0.2)
crayon 1.4.1 2021-02-08 [1] CRAN (R 4.0.3)
data.table 1.13.6 2020-12-30 [1] CRAN (R 4.0.2)
DBI 1.1.1 2021-01-15 [1] CRAN (R 4.0.3)
dbplyr 2.1.0 2021-02-03 [1] CRAN (R 4.0.3)
desc 1.2.0 2018-05-01 [1] CRAN (R 4.0.2)
details 0.2.1 2020-01-12 [1] CRAN (R 4.0.2)
digest 0.6.27 2020-10-24 [1] CRAN (R 4.0.2)
dplyr * 1.0.4 2021-02-02 [1] CRAN (R 4.0.3)
effectsize 0.4.3 2021-01-18 [1] CRAN (R 4.0.3)
ellipsis 0.3.1 2020-05-15 [1] CRAN (R 4.0.2)
evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.2)
extrafont * 0.17 2014-12-08 [1] CRAN (R 4.0.2)
extrafontdb 1.0 2012-06-11 [1] CRAN (R 4.0.2)
farver 2.0.3 2020-01-16 [1] CRAN (R 4.0.2)
forcats * 0.5.1 2021-01-27 [1] CRAN (R 4.0.3)
foreach 1.5.1 2020-10-15 [1] CRAN (R 4.0.2)
fs 1.5.0 2020-07-31 [1] CRAN (R 4.0.2)
gdtools 0.2.3 2021-01-06 [1] CRAN (R 4.0.2)
generics 0.1.0 2020-10-31 [1] CRAN (R 4.0.2)
ggplot2 * 3.3.3 2020-12-30 [1] CRAN (R 4.0.2)
ggridges 0.5.3 2021-01-08 [1] CRAN (R 4.0.2)
glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.2)
gtable 0.3.0 2019-03-25 [1] CRAN (R 4.0.2)
haven 2.3.1 2020-06-01 [1] CRAN (R 4.0.2)
hexbin 1.28.2 2021-01-08 [1] CRAN (R 4.0.2)
highr 0.8 2019-03-20 [1] CRAN (R 4.0.2)
hms 1.0.0 2021-01-13 [1] CRAN (R 4.0.3)
htmltools 0.5.1.1 2021-01-22 [1] CRAN (R 4.0.3)
httr 1.4.2 2020-07-20 [1] CRAN (R 4.0.2)
igraph 1.2.6 2020-10-06 [1] CRAN (R 4.0.2)
insight 0.12.0 2021-01-14 [1] CRAN (R 4.0.3)
IRanges * 2.22.2 2020-05-21 [1] Bioconductor
iterators 1.0.13 2020-10-15 [1] CRAN (R 4.0.2)
jsonlite 1.7.2 2020-12-09 [1] CRAN (R 4.0.2)
kableExtra * 1.3.1 2020-10-22 [1] CRAN (R 4.0.2)
knitr 1.31 2021-01-27 [1] CRAN (R 4.0.3)
labeling 0.4.2 2020-10-20 [1] CRAN (R 4.0.2)
lattice * 0.20-41 2020-04-02 [1] CRAN (R 4.0.2)
lifecycle 0.2.0 2020-03-06 [1] CRAN (R 4.0.2)
limma 3.44.3 2020-06-12 [1] Bioconductor
lubridate 1.7.9.2 2020-11-13 [1] CRAN (R 4.0.2)
magrittr * 2.0.1 2020-11-17 [1] CRAN (R 4.0.2)
MASS 7.3-53 2020-09-09 [1] CRAN (R 4.0.2)
Matrix 1.3-2 2021-01-06 [1] CRAN (R 4.0.2)
mgcv 1.8-33 2020-08-27 [1] CRAN (R 4.0.2)
modelr 0.1.8 2020-05-19 [1] CRAN (R 4.0.2)
multtest 2.44.0 2020-04-27 [1] Bioconductor
munsell 0.5.0 2018-06-12 [1] CRAN (R 4.0.2)
nlme 3.1-152 2021-02-04 [1] CRAN (R 4.0.3)
parameters 0.11.0 2021-01-15 [1] CRAN (R 4.0.3)
permute * 0.9-5 2019-03-12 [1] CRAN (R 4.0.2)
phyloseq * 1.32.0 2020-04-27 [1] Bioconductor
pillar 1.4.7 2020-11-20 [1] CRAN (R 4.0.2)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.2)
plyr 1.8.6 2020-03-03 [1] CRAN (R 4.0.2)
png 0.1-7 2013-12-03 [1] CRAN (R 4.0.2)
preprocessCore 1.50.0 2020-04-27 [1] Bioconductor
prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.0.2)
progress 1.2.2 2019-05-16 [1] CRAN (R 4.0.2)
purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.2)
R6 2.5.0 2020-10-28 [1] CRAN (R 4.0.2)
RColorBrewer 1.1-2 2014-12-07 [1] CRAN (R 4.0.2)
Rcpp 1.0.6 2021-01-15 [1] CRAN (R 4.0.3)
readr * 1.4.0 2020-10-05 [1] CRAN (R 4.0.2)
readxl 1.3.1 2019-03-13 [1] CRAN (R 4.0.2)
reprex 1.0.0 2021-01-27 [1] CRAN (R 4.0.3)
reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.0.2)
rhdf5 2.32.4 2020-10-05 [1] Bioconductor
Rhdf5lib 1.10.1 2020-07-09 [1] Bioconductor
rlang 0.4.10 2020-12-30 [1] CRAN (R 4.0.2)
rmarkdown 2.6 2020-12-14 [1] CRAN (R 4.0.2)
rprojroot 2.0.2 2020-11-15 [1] CRAN (R 4.0.2)
rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.0.2)
Rttf2pt1 1.3.8 2020-01-10 [1] CRAN (R 4.0.2)
rvest 0.3.6 2020-07-25 [1] CRAN (R 4.0.2)
S4Vectors * 0.26.1 2020-05-16 [1] Bioconductor
scales * 1.1.1 2020-05-11 [1] CRAN (R 4.0.2)
see * 0.6.2 2021-02-04 [1] CRAN (R 4.0.3)
sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.2)
speedyseq * 0.5.3.9001 2020-10-27 [1] Github (mikemc/speedyseq@8daed32)
stringi 1.5.3 2020-09-09 [1] CRAN (R 4.0.2)
stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.0.2)
survival 3.2-7 2020-09-28 [1] CRAN (R 4.0.2)
svglite * 1.2.3.2 2020-07-07 [1] CRAN (R 4.0.2)
systemfonts 1.0.1 2021-02-09 [1] CRAN (R 4.0.3)
tibble * 3.0.6 2021-01-29 [1] CRAN (R 4.0.3)
tidyr * 1.1.2 2020-08-27 [1] CRAN (R 4.0.2)
tidyselect 1.1.0 2020-05-11 [1] CRAN (R 4.0.2)
tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 4.0.2)
vctrs 0.3.6 2020-12-17 [1] CRAN (R 4.0.2)
vegan * 2.5-7 2020-11-28 [1] CRAN (R 4.0.2)
viridisLite 0.3.0 2018-02-01 [1] CRAN (R 4.0.2)
vsn * 3.56.0 2020-04-27 [1] Bioconductor
webshot 0.5.2 2019-11-22 [1] CRAN (R 4.0.2)
withr 2.4.1 2021-01-26 [1] CRAN (R 4.0.3)
xfun 0.20 2021-01-06 [1] CRAN (R 4.0.2)
xml2 1.3.2 2020-04-23 [1] CRAN (R 4.0.2)
XVector * 0.28.0 2020-04-27 [1] Bioconductor
yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.2)
zlibbioc 1.34.0 2020-04-27 [1] Bioconductor
[1] /home/angel/R/library
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library